Importing Libraries
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(rlang)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggthemes)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ✔ readr 2.1.4 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::%@%() masks rlang::%@%()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks rlang::flatten()
## ✖ purrr::flatten_chr() masks rlang::flatten_chr()
## ✖ purrr::flatten_dbl() masks rlang::flatten_dbl()
## ✖ purrr::flatten_int() masks rlang::flatten_int()
## ✖ purrr::flatten_lgl() masks rlang::flatten_lgl()
## ✖ purrr::flatten_raw() masks rlang::flatten_raw()
## ✖ purrr::invoke() masks rlang::invoke()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::splice() masks rlang::splice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(corrplot)
## corrplot 0.92 loaded
Loading the Dataset
# Read the cleaned mining dataset from the working directory and preview its
# first six rows.
Cleaned_bitcoin_mining <- read.csv(file = "Cleaned_bitcoin_mining.csv")
head(x = Cleaned_bitcoin_mining, n = 6L)
## Date.and.Time power.MAX..GW power.MIN..GW power.GUESS..GW
## 1 2010-07-18T00:00:00 2.67e-05 2.24e-05 2.44e-05
## 2 2010-07-19T00:00:00 2.68e-05 2.26e-05 2.46e-05
## 3 2010-07-20T00:00:00 2.72e-05 2.29e-05 2.50e-05
## 4 2010-07-21T00:00:00 2.84e-05 2.39e-05 2.61e-05
## 5 2010-07-22T00:00:00 2.82e-05 2.37e-05 2.59e-05
## 6 2010-07-23T00:00:00 2.85e-05 2.40e-05 2.61e-05
## annualised.consumption.MAX..TWh annualised.consumption.MIN..TWh
## 1 0.000233717 0.000196712
## 2 0.000235075 0.000197855
## 3 0.000238699 0.000200905
## 4 0.000249343 0.000209864
## 5 0.000247305 0.000208148
## 6 0.000250023 0.000210436
## annualised.consumption.GUESS..TWh Lower.bound.efficiency..J.Th
## 1 0.000214241 14313700
## 2 0.000215486 14313700
## 3 0.000218808 14313700
## 4 0.000228565 14313700
## 5 0.000226696 14313700
## 6 0.000229188 14313700
## Estimated.efficiency..J.Th Upper.bound.efficiency..J.Th Hydro.only..MtCO2e
## 1 14313700 14313700 4e-06
## 2 14313700 14313700 5e-06
## 3 14313700 14313700 5e-06
## 4 14313700 14313700 5e-06
## 5 14313700 14313700 5e-06
## 6 14313700 14313700 5e-06
## Estimated..MtCO2e Coal.only..MtCO2e Emission.intensity..gCO2e.kWh
## 1 0.000119 0.000214 554.1215
## 2 0.000119 0.000216 554.1215
## 3 0.000121 0.000219 554.1215
## 4 0.000127 0.000229 554.1215
## 5 0.000126 0.000227 554.1215
## 6 0.000127 0.000229 554.1215
## Hash.rate.MH.s
## 1 0.001606373
## 2 0.001822962
## 3 0.001822962
## 4 0.001750766
## 5 0.001669545
## 6 0.001669545
Checking the dimension and Structure of data
# Report the number of rows and the number of columns, in that order.
c(nrow(Cleaned_bitcoin_mining), ncol(Cleaned_bitcoin_mining))
## [1] 4815 15
# Print each column's storage type together with its first few values.
str(Cleaned_bitcoin_mining)
## 'data.frame': 4815 obs. of 15 variables:
## $ Date.and.Time : chr "2010-07-18T00:00:00" "2010-07-19T00:00:00" "2010-07-20T00:00:00" "2010-07-21T00:00:00" ...
## $ power.MAX..GW : num 2.67e-05 2.68e-05 2.72e-05 2.84e-05 2.82e-05 2.85e-05 2.86e-05 2.99e-05 3.15e-05 3.23e-05 ...
## $ power.MIN..GW : num 2.24e-05 2.26e-05 2.29e-05 2.39e-05 2.37e-05 2.40e-05 2.41e-05 2.52e-05 2.65e-05 2.72e-05 ...
## $ power.GUESS..GW : num 2.44e-05 2.46e-05 2.50e-05 2.61e-05 2.59e-05 2.61e-05 2.62e-05 2.74e-05 2.88e-05 2.96e-05 ...
## $ annualised.consumption.MAX..TWh : num 0.000234 0.000235 0.000239 0.000249 0.000247 ...
## $ annualised.consumption.MIN..TWh : num 0.000197 0.000198 0.000201 0.00021 0.000208 ...
## $ annualised.consumption.GUESS..TWh: num 0.000214 0.000215 0.000219 0.000229 0.000227 ...
## $ Lower.bound.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Estimated.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Upper.bound.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Hydro.only..MtCO2e : num 4e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 ...
## $ Estimated..MtCO2e : num 0.000119 0.000119 0.000121 0.000127 0.000126 0.000127 0.000127 0.000133 0.00014 0.000144 ...
## $ Coal.only..MtCO2e : num 0.000214 0.000216 0.000219 0.000229 0.000227 0.000229 0.00023 0.000241 0.000253 0.00026 ...
## $ Emission.intensity..gCO2e.kWh : num 554 554 554 554 554 ...
## $ Hash.rate.MH.s : num 0.00161 0.00182 0.00182 0.00175 0.00167 ...
Our dataset contains 4,815 observations (rows) and 15 variables
(columns). Its structure shows that it records power consumption,
mining efficiency, CO2 emissions, and hash rates for the Bitcoin
network.
Summary Statistics
# Print per-column summary statistics (min, quartiles, mean, max for numeric
# columns; length/class/mode for character columns).
summary(Cleaned_bitcoin_mining)
## Date.and.Time power.MAX..GW power.MIN..GW power.GUESS..GW
## Length:4815 Min. : 0.00003 Min. :0.000022 Min. : 0.000024
## Class :character 1st Qu.: 0.39179 1st Qu.:0.031152 1st Qu.: 0.154086
## Mode :character Median : 2.12457 Median :0.384142 Median : 0.905217
## Mean : 9.82974 Mean :2.039373 Mean : 3.989582
## 3rd Qu.:15.41883 3rd Qu.:4.049493 3rd Qu.: 7.710647
## Max. :56.01570 Max. :8.947454 Max. :15.063222
## annualised.consumption.MAX..TWh annualised.consumption.MIN..TWh
## Min. : 0.0002 Min. : 0.0002
## 1st Qu.: 3.4344 1st Qu.: 0.2731
## Median : 18.6240 Median : 3.3674
## Mean : 86.1675 Mean :17.8771
## 3rd Qu.:135.1615 3rd Qu.:35.4978
## Max. :491.0337 Max. :78.4334
## annualised.consumption.GUESS..TWh Lower.bound.efficiency..J.Th
## Min. : 0.00021 Min. : 21
## 1st Qu.: 1.35072 1st Qu.: 38
## Median : 7.93513 Median : 98
## Mean : 34.97267 Mean : 458086
## 3rd Qu.: 67.59153 3rd Qu.: 9917
## Max. :132.04420 Max. :14313700
## Estimated.efficiency..J.Th Upper.bound.efficiency..J.Th Hydro.only..MtCO2e
## Min. : 31 Min. : 46 Min. :0.000004
## 1st Qu.: 68 1st Qu.: 167 1st Qu.:0.028365
## Median : 261 Median : 766 Median :0.166638
## Mean : 771891 Mean : 1292594 Mean :0.734426
## 3rd Qu.: 36553 3rd Qu.: 75000 3rd Qu.:1.419422
## Max. :14313700 Max. :14313700 Max. :2.772928
## Estimated..MtCO2e Coal.only..MtCO2e Emission.intensity..gCO2e.kWh
## Min. : 0.00012 Min. : 0.00021 Min. :359.5
## 1st Qu.: 0.75628 1st Qu.: 1.35207 1st Qu.:512.8
## Median : 4.22858 Median : 7.94307 Median :533.7
## Mean :17.95686 Mean : 35.00765 Mean :532.2
## 3rd Qu.:31.96006 3rd Qu.: 67.65912 3rd Qu.:559.0
## Max. :66.90830 Max. :132.17625 Max. :594.6
## Hash.rate.MH.s
## Min. : 0
## 1st Qu.: 3838
## Median : 3210303
## Mean : 64397862
## 3rd Qu.:111495251
## Max. :506061817
From the summary statistics, we can see the distribution and range
of each variable. No NA counts are reported for any column, which
suggests there are no missing values; this is verified explicitly below.
Data cleaning
Checking for missing values
# Count the NA cells across the whole data frame (all columns combined).
sum(is.na(Cleaned_bitcoin_mining))
## [1] 0
There are no missing values, as this is the cleaned dataset: every
column has complete data for all rows.
Checking number of Unique values
# Count the distinct values in every column. vapply() pins the result to
# exactly one integer per column, so the output type cannot silently change
# with the shape of the input the way sapply()'s can.
vapply(
  Cleaned_bitcoin_mining,
  function(column) length(unique(column)),
  integer(1)
)
## Date.and.Time power.MAX..GW
## 4815 4767
## power.MIN..GW power.GUESS..GW
## 4745 4771
## annualised.consumption.MAX..TWh annualised.consumption.MIN..TWh
## 4771 4750
## annualised.consumption.GUESS..TWh Lower.bound.efficiency..J.Th
## 4774 24
## Estimated.efficiency..J.Th Upper.bound.efficiency..J.Th
## 275 44
## Hydro.only..MtCO2e Estimated..MtCO2e
## 4543 4757
## Coal.only..MtCO2e Emission.intensity..gCO2e.kWh
## 4761 39
## Hash.rate.MH.s
## 3801
“Date and Time” has 4,815 unique values, which means that each row
corresponds to a unique timestamp. Most of the columns have a large
number of unique values, suggesting continuous data, but a few columns such as
“Lower bound efficiency, J/Th”, “Upper bound efficiency, J/Th”, and
“Emission intensity, gCO2e/kWh” have far fewer distinct values, indicating potential
categories or repeated measurements.
Changing the “Date and Time” data type
# Parse the ISO-8601 timestamp strings into POSIXct.
# The time zone is pinned explicitly: without `tz`, as.POSIXct() interprets the
# strings in the machine's local zone, so the same script yields different
# instants on different machines, and a midnight that falls inside a local
# daylight-saving gap may not parse cleanly.
# NOTE(review): assumes the source timestamps are in UTC -- confirm against the
# data provider's documentation.
Cleaned_bitcoin_mining$Date.and.Time <- as.POSIXct(
  Cleaned_bitcoin_mining$Date.and.Time,
  format = "%Y-%m-%dT%H:%M:%S",
  tz = "UTC"
)
# Re-inspect the structure to confirm the column type changed.
str(Cleaned_bitcoin_mining)
## 'data.frame': 4815 obs. of 15 variables:
## $ Date.and.Time : POSIXct, format: "2010-07-18" "2010-07-19" ...
## $ power.MAX..GW : num 2.67e-05 2.68e-05 2.72e-05 2.84e-05 2.82e-05 2.85e-05 2.86e-05 2.99e-05 3.15e-05 3.23e-05 ...
## $ power.MIN..GW : num 2.24e-05 2.26e-05 2.29e-05 2.39e-05 2.37e-05 2.40e-05 2.41e-05 2.52e-05 2.65e-05 2.72e-05 ...
## $ power.GUESS..GW : num 2.44e-05 2.46e-05 2.50e-05 2.61e-05 2.59e-05 2.61e-05 2.62e-05 2.74e-05 2.88e-05 2.96e-05 ...
## $ annualised.consumption.MAX..TWh : num 0.000234 0.000235 0.000239 0.000249 0.000247 ...
## $ annualised.consumption.MIN..TWh : num 0.000197 0.000198 0.000201 0.00021 0.000208 ...
## $ annualised.consumption.GUESS..TWh: num 0.000214 0.000215 0.000219 0.000229 0.000227 ...
## $ Lower.bound.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Estimated.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Upper.bound.efficiency..J.Th : num 14313700 14313700 14313700 14313700 14313700 ...
## $ Hydro.only..MtCO2e : num 4e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 ...
## $ Estimated..MtCO2e : num 0.000119 0.000119 0.000121 0.000127 0.000126 0.000127 0.000127 0.000133 0.00014 0.000144 ...
## $ Coal.only..MtCO2e : num 0.000214 0.000216 0.000219 0.000229 0.000227 0.000229 0.00023 0.000241 0.000253 0.00026 ...
## $ Emission.intensity..gCO2e.kWh : num 554 554 554 554 554 ...
## $ Hash.rate.MH.s : num 0.00161 0.00182 0.00182 0.00175 0.00167 ...
# Show the class vector of the converted date-time column.
class(Cleaned_bitcoin_mining$Date.and.Time)
## [1] "POSIXct" "POSIXt"
# Earliest and latest timestamps covered by the dataset.
date_range <- range(Cleaned_bitcoin_mining[["Date.and.Time"]])
print(date_range)
## [1] "2010-07-18 EDT" "2023-09-22 EDT"
We convert the “Date and Time” column to POSIXct because many
plotting functions understand POSIXct/POSIXt objects and will correctly
format axes and labels when plotting date-time values; this type is also better suited to
date-based data manipulation and arithmetic.
Univariate Analysis
# Columns analysed in the univariate section.
variables <- c(
  "power.GUESS..GW", "annualised.consumption.GUESS..TWh",
  "Estimated.efficiency..J.Th", "Hydro.only..MtCO2e", "Estimated..MtCO2e",
  "Coal.only..MtCO2e", "Emission.intensity..gCO2e.kWh", "Hash.rate.MH.s"
)
# Display labels, positionally matched to `variables`.
var_names <- c(
  "Power (GW)", "Annualised Consumption (TWh)", "Estimated Efficiency (J/Th)",
  "Hydro Only Emissions (MtCO2e)", "Estimated Emissions (MtCO2e)",
  "Coal Only Emissions (MtCO2e)", "Emission Intensity (gCO2e/kWh)",
  "Hash Rate (MH/s)"
)
# Convert the selected columns to long format (one row per variable/value
# pair) so they can be facetted.
df_long <- Cleaned_bitcoin_mining %>%
  select(all_of(variables)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Value"
  )
# Turn the column names into an ordered factor carrying the display labels, so
# facets appear in the order of `variables` with readable titles.
df_long$Variable <- factor(
  df_long$Variable,
  levels = variables,
  labels = var_names
)
# Faceted histograms with a frequency polygon overlaid.
# after_stat(count) and `linewidth` replace the `..count..` notation and the
# `size` argument for lines, both deprecated since ggplot2 3.4.0. The polygon
# gets the same explicit bin count as the bars so the two layers line up and
# stat_bin() has no default to warn about.
p <- ggplot(df_long, aes(x = Value)) +
  geom_histogram(
    aes(y = after_stat(count)),
    fill = "#66c2a5", color = "#004d40", bins = 30
  ) +
  geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
  facet_wrap(~ Variable, scales = "free", ncol = 2) +
  theme_minimal() +
  labs(title = "Histograms of Selected Variables", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Render the plot object `p`.
print(p)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Draw one histogram (with a frequency polygon overlaid) per selected variable.
# seq_along() is used instead of 1:length() so an empty `variables` vector
# yields zero iterations rather than iterating over c(1, 0).
for (i in seq_along(variables)) {
  # Keep only the rows of the long table that belong to this variable.
  df_subset <- df_long[df_long$Variable == var_names[i], ]
  # after_stat(count) / `linewidth` replace the deprecated `..count..` /
  # `size`; the polygon shares the bars' explicit bin count.
  p <- ggplot(df_subset, aes(x = Value)) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(title = paste("Histogram of", var_names[i]), y = "Frequency") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Inside a loop, plots are not auto-printed, so print explicitly.
  print(p)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Outliers
Boxplots to visualize outliers
# Draw one box plot per selected variable, with outlier points in red.
# seq_along() is used instead of 1:length() so an empty `variables` vector
# yields zero iterations.
for (i in seq_along(variables)) {
  # The column is looked up through the `.data` pronoun rather than by indexing
  # the data frame inside aes(), which ggplot2 discourages. The y-axis label is
  # set explicitly below, so the displayed text is unchanged.
  p <- ggplot(Cleaned_bitcoin_mining, aes(y = .data[[variables[i]]])) +
    geom_boxplot(
      fill = "#66c2a5", color = "#004d40",
      outlier.color = "red", outlier.size = 2
    ) +
    labs(title = paste("Box Plot of", var_names[i]), y = var_names[i]) +
    theme_minimal()
  print(p)
}
## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

IQR
# Columns checked for outliers.
variables <- c(
  "power.GUESS..GW", "annualised.consumption.GUESS..TWh",
  "Estimated.efficiency..J.Th", "Hydro.only..MtCO2e", "Estimated..MtCO2e",
  "Coal.only..MtCO2e", "Emission.intensity..gCO2e.kWh", "Hash.rate.MH.s"
)
# For each variable, count the observations lying more than 1.5 * IQR below the
# first quartile or above the third quartile.
# vapply() guarantees an integer vector and, because `variables` is a character
# vector, names the result by variable, so no separate names<- step is needed.
outliers_counts <- vapply(variables, function(var) {
  values <- Cleaned_bitcoin_mining[[var]]
  # Both quartiles in one call; names dropped so the bounds are plain numbers.
  quartiles <- quantile(
    values,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  # Named iqr_width rather than IQR to avoid shadowing stats::IQR().
  iqr_width <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - 1.5 * iqr_width
  upper_bound <- quartiles[2] + 1.5 * iqr_width
  # Summing the logical mask counts the outliers directly; NA comparisons are
  # ignored instead of being counted as outliers.
  sum(values < lower_bound | values > upper_bound, na.rm = TRUE)
}, integer(1))
outliers_counts
## power.GUESS..GW annualised.consumption.GUESS..TWh
## 0 0
## Estimated.efficiency..J.Th Hydro.only..MtCO2e
## 1097 0
## Estimated..MtCO2e Coal.only..MtCO2e
## 0 0
## Emission.intensity..gCO2e.kWh Hash.rate.MH.s
## 214 254
Cap/Floor Outliers
# Work on a copy so the original measurements stay untouched.
Cleaned_bitcoin_mining_copy <- Cleaned_bitcoin_mining
# Cap/floor every selected column at its own 1.5 * IQR bounds: values below
# Q1 - 1.5 * IQR are raised to that bound, values above Q3 + 1.5 * IQR are
# lowered to that bound, and everything in between is left as is.
# The work happens inside a function so that the intermediate quartiles and
# bounds do not leak into the global environment (the previous loop left a
# global `IQR` variable that shadowed stats::IQR).
Cleaned_bitcoin_mining_copy[variables] <- lapply(
  Cleaned_bitcoin_mining_copy[variables],
  function(values) {
    quartiles <- quantile(
      values,
      probs = c(0.25, 0.75),
      na.rm = TRUE,
      names = FALSE
    )
    iqr_width <- quartiles[2] - quartiles[1]
    lower_bound <- quartiles[1] - 1.5 * iqr_width
    upper_bound <- quartiles[2] + 1.5 * iqr_width
    # pmax() applies the floor and pmin() the cap; NA values stay NA.
    pmin(pmax(values, lower_bound), upper_bound)
  }
)
# Summarise the capped columns to verify the new minima and maxima.
summary(Cleaned_bitcoin_mining_copy[variables])
## power.GUESS..GW annualised.consumption.GUESS..TWh
## Min. : 0.000024 Min. : 0.00021
## 1st Qu.: 0.154086 1st Qu.: 1.35072
## Median : 0.905217 Median : 7.93513
## Mean : 3.989582 Mean : 34.97267
## 3rd Qu.: 7.710647 3rd Qu.: 67.59153
## Max. :15.063222 Max. :132.04420
## Estimated.efficiency..J.Th Hydro.only..MtCO2e Estimated..MtCO2e
## Min. : 31.13 Min. :0.000004 Min. : 0.00012
## 1st Qu.: 67.72 1st Qu.:0.028365 1st Qu.: 0.75628
## Median : 260.92 Median :0.166638 Median : 4.22858
## Mean :23180.17 Mean :0.734426 Mean :17.95686
## 3rd Qu.:36553.00 3rd Qu.:1.419422 3rd Qu.:31.96006
## Max. :91280.91 Max. :2.772928 Max. :66.90830
## Coal.only..MtCO2e Emission.intensity..gCO2e.kWh Hash.rate.MH.s
## Min. : 0.00021 Min. :443.5 Min. : 0
## 1st Qu.: 1.35207 1st Qu.:512.8 1st Qu.: 3838
## Median : 7.94307 Median :533.7 Median : 3210303
## Mean : 35.00765 Mean :534.2 Mean : 60413666
## 3rd Qu.: 67.65912 3rd Qu.:559.0 3rd Qu.:111495251
## Max. :132.17625 Max. :594.6 Max. :278732371
# Re-draw the histogram of each selected variable from the capped/floored copy.
for (var in variables) {
  # `.data[[var]]` replaces aes_string(), deprecated since ggplot2 3.0.0;
  # after_stat(count) and `linewidth` replace the deprecated `..count..` and
  # `size`. The x label is set explicitly to the column name so the axis text
  # matches what aes_string() produced.
  p <- ggplot(Cleaned_bitcoin_mining_copy, aes(x = .data[[var]])) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(
      title = paste("Histogram of", var, "after Capping/Flooring"),
      x = var,
      y = "Frequency"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
